In [1]:
import numpy as np
import pandas as pd
import json
In [2]:
products = pd.read_csv('amazon_baby_subset.csv')
In [3]:
products = products.fillna({'review':''}) # fill in N/A's in the review column
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
products.head(3)
Out[3]:
In [4]:
with open('important_words.json') as important_words_file:
    important_words = json.load(important_words_file)
print important_words[:3]
In [5]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))
In [6]:
products['perfect'][:3]
Out[6]:
In [7]:
with open('module-4-assignment-train-idx.json') as train_data_file:
    train_data_idx = json.load(train_data_file)
with open('module-4-assignment-validation-idx.json') as validation_data_file:
    validation_data_idx = json.load(validation_data_file)
print train_data_idx[:3]
print validation_data_idx[:3]
In [8]:
print len(train_data_idx)
print len(validation_data_idx)
In [9]:
train_data = products.iloc[train_data_idx]
train_data.head(2)
Out[9]:
In [10]:
validation_data = products.iloc[validation_data_idx]
validation_data.head(2)
Out[10]:
We reuse the get_numpy_data function given in #8 of the Module 3 assignment (Programming Assignment 2) to convert the data frame into a NumPy feature matrix and a label array.
In [11]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = features_frame.as_matrix()
    label_sarray = dataframe[label]
    label_array = label_sarray.as_matrix()
    return (feature_matrix, label_array)
In [12]:
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
In [13]:
print feature_matrix_train.shape
print feature_matrix_valid.shape
In [14]:
def predict_probability(feature_matrix, coefficients):
    """
    feature_matrix: N * D (intercept term included)
    coefficients:   D * 1
    predictions:    N * 1

    Produces the probabilistic estimate of P(y_i = +1 | x_i, w).
    Each estimate ranges between 0 and 1.
    """
    # Take the dot product of feature_matrix and coefficients
    score = np.dot(feature_matrix, coefficients)  # N * 1
    # Compute P(y_i = +1 | x_i, w) using the link (sigmoid) function
    predictions = 1.0 / (1 + np.exp(-score))
    return predictions
The function takes the feature matrix and the coefficient vector, computes the score for every data point as their dot product, and applies the link (sigmoid) function to produce P(y_i = +1 | x_i, w).
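Written out, this is the standard logistic link function:

$$
P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w}) = \frac{1}{1 + \exp\!\big(-\mathbf{w}^\top h(\mathbf{x}_i)\big)}
$$

where $h(\mathbf{x}_i)$ is the feature vector (including the constant) for data point $i$.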
In [15]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant):
    """
    errors:      N * 1
    feature:     N * 1
    coefficient: scalar
    derivative:  scalar
    """
    # Compute the dot product of errors and feature
    derivative = np.dot(np.transpose(errors), feature)
    # Add the L2 penalty term for any feature that isn't the intercept
    if not feature_is_constant:
        derivative -= 2 * l2_penalty * coefficient
    return derivative
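In math, the quantity returned above is the partial derivative of the L2-regularized log likelihood with respect to a single coefficient (the $-2\lambda w_j$ term is skipped when the feature is the intercept):

$$
\frac{\partial \ell}{\partial w_j} = \sum_{i=1}^{N} h_j(\mathbf{x}_i)\Big(\mathbf{1}[y_i = +1] - P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w})\Big) - 2\lambda w_j
$$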
In [16]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment == +1)
    scores = np.dot(feature_matrix, coefficients)
    # scores has shape (N, 1) while indicator has shape (N,), so reshape the
    # indicator into a column before broadcasting it against scores.
    lp = np.sum((np.transpose(np.array([indicator])) - 1) * scores - np.log(1. + np.exp(-scores))) \
         - l2_penalty * np.sum(coefficients[1:]**2)
    return lp
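For reference, the quantity computed above is the L2-regularized log likelihood, where the intercept $w_0$ is excluded from the penalty:

$$
\ell\ell(\mathbf{w}) = \sum_{i=1}^{N} \Big( \big(\mathbf{1}[y_i = +1] - 1\big)\,\mathbf{w}^\top h(\mathbf{x}_i) - \ln\!\big(1 + e^{-\mathbf{w}^\top h(\mathbf{x}_i)}\big) \Big) - \lambda \sum_{j=1}^{D} w_j^2
$$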
The logistic_regression_with_L2 function below accepts the following parameters: the feature matrix, the sentiment labels, the initial coefficients, the step size, the L2 penalty, and the maximum number of iterations.
The function carries out the following steps: at each iteration it predicts P(y_i = +1 | x_i, w) with predict_probability, computes the errors as indicator minus predictions, evaluates the L2-regularized derivative for each coefficient with feature_derivative_with_L2, takes a gradient ascent step of size step_size, and periodically prints the log likelihood to check that it is increasing.
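Putting the pieces together, each coefficient is updated by gradient ascent with step size $\eta$ (the $-2\lambda w_j$ term is again omitted for the intercept):

$$
w_j \leftarrow w_j + \eta \left( \sum_{i=1}^{N} h_j(\mathbf{x}_i)\Big(\mathbf{1}[y_i = +1] - P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w})\Big) - 2\lambda w_j \right)
$$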
In [17]:
# coefficients: D * 1
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients)  # make sure it's a numpy array
    for itr in xrange(max_iter):
        # Predict P(y_i = +1|x_i,w) using the predict_probability() function
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment == +1)

        # Compute the errors as indicator - predictions (as an N * 1 column)
        errors = np.transpose(np.array([indicator])) - predictions

        for j in xrange(len(coefficients)):  # loop over each coefficient
            is_intercept = (j == 0)
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j].
            # Compute the derivative for coefficients[j].
            derivative = feature_derivative_with_L2(errors, feature_matrix[:,j], coefficients[j], l2_penalty, is_intercept)
            # Add the step size times the derivative to the current coefficient
            coefficients[j] += step_size * derivative

        # Check whether the log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            print 'iteration %*d: log likelihood of observed labels = %.8f' % \
                (int(np.ceil(np.log10(max_iter))), itr, lp)
    return coefficients
Now that we have written up all the pieces needed for an L2 solver with logistic regression, let's explore the benefits of using L2 regularization while analyzing sentiment for product reviews. As iterations pass, the log likelihood should increase.
Let us train models with increasing amounts of regularization, starting with no L2 penalty, which is equivalent to our previous logistic regression implementation. Train 6 models with L2 penalty values 0, 4, 10, 1e2, 1e3, and 1e5. Use the following values for the other parameters: initial_coefficients of all zeros (one entry per important word plus the intercept, 194 in total), step_size = 5e-6, and max_iter = 501.
Save the 6 sets of coefficients as coefficients_0_penalty, coefficients_4_penalty, coefficients_10_penalty, coefficients_1e2_penalty, coefficients_1e3_penalty, and coefficients_1e5_penalty respectively.
In [18]:
initial_coefficients = np.zeros((194,1))
step_size = 5e-6
max_iter = 501
In [19]:
coefficients_0_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, step_size, 0, max_iter)
In [20]:
coefficients_4_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, step_size, 4, max_iter)
In [21]:
coefficients_10_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, step_size, 10, max_iter)
In [22]:
coefficients_1e2_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, step_size, 1e2, max_iter)
In [23]:
coefficients_1e3_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, step_size, 1e3, max_iter)
In [24]:
coefficients_1e5_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, step_size, 1e5, max_iter)
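The six training cells above differ only in the penalty value. As a minimal alternative sketch (using only the functions and parameters already defined; the dictionary name coefficients_by_penalty is just an illustration), the same models could be trained in a loop:

# Hedged sketch: equivalent loop over the six penalties used above.
l2_penalties = [0, 4, 10, 1e2, 1e3, 1e5]
coefficients_by_penalty = {}
for l2_penalty in l2_penalties:
    # start each model from a fresh all-zeros coefficient vector (193 words + intercept)
    coefficients_by_penalty[l2_penalty] = logistic_regression_with_L2(
        feature_matrix_train, sentiment_train,
        np.zeros((194, 1)), step_size, l2_penalty, max_iter)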
In [25]:
coefficients_0_penalty_without_intercept = list(coefficients_0_penalty[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients_0_penalty_without_intercept)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
In [26]:
positive_words = []
for i in range(5):
    positive_words.append(word_coefficient_tuples[:5][i][0])
positive_words
Out[26]:
In [27]:
negative_words = []
for i in range(5):
    negative_words.append(word_coefficient_tuples[-5:][i][0])
negative_words
Out[27]:
In [28]:
table = pd.DataFrame(data=[coefficients_0_penalty.flatten(), coefficients_4_penalty.flatten(), coefficients_10_penalty.flatten(), coefficients_1e2_penalty.flatten(), coefficients_1e3_penalty.flatten(), coefficients_1e5_penalty.flatten()],
                     index=[0, 4, 10, 100.0, 1000.0, 100000.0],
                     columns=['(intercept)'] + important_words)
In [29]:
table.head(2)
Out[29]:
In [30]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 6
def make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list):
    cmap_positive = plt.get_cmap('Reds')
    cmap_negative = plt.get_cmap('Blues')

    xx = l2_penalty_list
    plt.plot(xx, [0.]*len(xx), '--', lw=1, color='k')

    table_positive_words = table[positive_words]
    table_negative_words = table[negative_words]
    #del table_positive_words['word']
    #del table_negative_words['word']

    for i, value in enumerate(positive_words):
        color = cmap_positive(0.8*((i+1)/(len(positive_words)*1.2)+0.15))
        plt.plot(xx, table_positive_words[value].as_matrix().flatten(),
                 '-', label=positive_words[i], linewidth=4.0, color=color)

    for i, value in enumerate(negative_words):
        color = cmap_negative(0.8*((i+1)/(len(negative_words)*1.2)+0.15))
        plt.plot(xx, table_negative_words[value].as_matrix().flatten(),
                 '-', label=negative_words[i], linewidth=4.0, color=color)

    plt.legend(loc='best', ncol=3, prop={'size':16}, columnspacing=0.5)
    plt.axis([1, 1e5, -1, 2])
    plt.title('Coefficient path')
    plt.xlabel(r'L2 penalty ($\lambda$)')
    plt.ylabel('Coefficient value')
    plt.xscale('log')
    plt.rcParams.update({'font.size': 18})
    plt.tight_layout()
make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list=[0, 4, 10, 1e2, 1e3, 1e5])
In [31]:
"""
feature_matrix: N * D
coefficients: D * 1
predictions: N * 1
"""
training_accuracy = []
for coefficient in [coefficients_0_penalty, coefficients_4_penalty, coefficients_10_penalty, coefficients_1e2_penalty, coefficients_1e3_penalty, coefficients_1e5_penalty]:
predictions = predict_probability(feature_matrix_train, coefficient)
correct_num = np.sum((np.transpose(predictions.flatten())> 0.5) == (np.array(sentiment_train)>0))
total_num = len(sentiment_train)
#print "correct_num: {}, total_num: {}".format(correct_num, total_num)
training_accuracy.append(correct_num * 1./ total_num)
#print accuracy
l2_penalty_list=[x for x in range(6)]
plt.plot(l2_penalty_list, training_accuracy,'ro')
plt.title('training_accuracy')
plt.xlabel('L2 penalty ($\lambda$)')
plt.ylabel('training_accuracy')
plt.show()
In [32]:
[0, 4, 10, 1e2, 1e3, 1e5][training_accuracy.index(max(training_accuracy))]
Out[32]:
In [33]:
training_accuracy[training_accuracy.index(max(training_accuracy))]
Out[33]:
In [34]:
vali_accuracy = []
for coefficient in [coefficients_0_penalty, coefficients_4_penalty, coefficients_10_penalty, coefficients_1e2_penalty, coefficients_1e3_penalty, coefficients_1e5_penalty]:
    predictions = predict_probability(feature_matrix_valid, coefficient)
    correct_num = np.sum((predictions.flatten() > 0.5) == (np.array(sentiment_valid) > 0))
    total_num = len(sentiment_valid)
    vali_accuracy.append(correct_num * 1. / total_num)

# x-axis values are the indices of the penalties in [0, 4, 10, 1e2, 1e3, 1e5]
l2_penalty_list = [x for x in range(6)]
plt.plot(l2_penalty_list, vali_accuracy, 'ro')
plt.title('vali_accuracy')
plt.xlabel(r'L2 penalty ($\lambda$)')
plt.ylabel('vali_accuracy')
plt.show()
In [35]:
[0, 4, 10, 1e2, 1e3, 1e5][vali_accuracy.index(max(vali_accuracy))]
Out[35]:
In [36]:
vali_accuracy[vali_accuracy.index(max(vali_accuracy))]
Out[36]:
In [37]:
plt.plot(l2_penalty_list, training_accuracy,'g')
plt.plot(l2_penalty_list, vali_accuracy,'r')
Out[37]: